Combining Simulation Results

library(tidyverse)

read_files <- function (dir) {
  setwd(dir)
  files <- list.files(pattern="_output_")
  
  mylist <- lapply(files, function(x) {
    load(file = x)
    get(ls()[ls()!= "filename"])
  })
  
  all <- do.call("bind_rows", mylist)
  return(all)
}

#load in files
all <- read_files("~/Desktop/Cluster/MLSims_Revisions/23MayResults")

nrow(all) #to make sure every iteration ran
## [1] 4981
nrow(all %>% select(-iteration) %>% distinct()) 
## [1] 4981
length(unique(all$seed)) #to make sure there are no duplicates
## [1] 4981
#checking number of iterations
table(all$scenario)
## 
##   1a   1b    2 
## 2321 2075  585
all <- all %>%
  mutate(scenario = factor(scenario, levels=c("1a","1b","2")),
         sc_combo = paste(K, ns, cov_shift, study_sd, study_inter_sd, scenario, sep="_"),
         sc_combo = factor(sc_combo))

table(all$sc_combo)
## 
## 10_half and half_no_1_0.5_1a 10_half and half_no_1_0.5_1b 
##                          188                          192 
##  10_half and half_no_NA_NA_2     10_one large_no_1_0.5_1a 
##                          194                          197 
##     10_one large_no_1_0.5_1b      10_one large_no_NA_NA_2 
##                          194                          190 
##    10_one large_yes_1_0.5_1a    10_one large_yes_1_0.5_1b 
##                          187                          218 
##          10_same_no_0.5_0_1a          10_same_no_0.5_0_1b 
##                          299                          201 
##            10_same_no_1_0_1a            10_same_no_1_0_1b 
##                          292                          209 
##          10_same_no_1_0.5_1a          10_same_no_1_0.5_1b 
##                          252                          204 
##            10_same_no_1_1_1a            10_same_no_1_1_1b 
##                          257                          208 
##            10_same_no_3_1_1a            10_same_no_3_1_1b 
##                          215                          196 
##           10_same_no_NA_NA_2         10_same_yes_1_0.5_1a 
##                          201                          204 
##         10_same_yes_1_0.5_1b          30_same_no_1_0.5_1a 
##                          214                          230 
##          30_same_no_1_0.5_1b 
##                          239
#get means and sds per setting
mses <- all %>%
  group_by(K, ns, cov_shift, study_sd, study_inter_sd, scenario, honesty, sc_combo) %>%
  summarise(across(x_nostudy:ma, list(mean=mean, sd=sd)),
            n_iter = n())
#make long data for plotting, just with means
mses_long <- all %>%
  group_by(K, ns, cov_shift, study_sd, study_inter_sd, scenario, honesty, sc_combo) %>%
  summarise(across(x_nostudy:ma, mean),
            n_iter = n()) %>%
  pivot_longer(cols=x_nostudy:ma,
               names_to="Method",
               values_to="MSE") %>%
  mutate(MSE = as.numeric(MSE),
         base = case_when(grepl("x_",Method)==T ~ "X-Learner",
                          grepl("s_",Method)==T ~ "S-Learner",
                          grepl("causal_",Method)==T ~ "Causal Forest",
                          grepl("ma",Method)==T ~ "Meta-Analysis"),
         ensemble = case_when(grepl("nostudy",Method)==T ~ "Complete Pooling",
                              grepl("studyind",Method)==T ~ "Trial Indicator",
                              grepl("tree", Method)==T ~ "Ensemble Tree",
                              grepl("forest",Method)==T ~ "Ensemble Forest",
                              grepl("lasso",Method)==T ~ "Ensemble Lasso",
                              grepl("ss",Method)==T ~ "Single-Study",
                              grepl("ma",Method)==T ~ "Meta-Analysis"),
         base = factor(base, levels=c("S-Learner", "X-Learner", "Causal Forest", "Meta-Analysis")), 
         ensemble = factor(ensemble, levels=c("Single-Study", "Complete Pooling", "Trial Indicator", "Ensemble Tree",
                                              "Ensemble Forest","Ensemble Lasso", "Meta-Analysis")),
         sds = paste(study_sd, study_inter_sd, sep=", "), sds = factor(sds),
         scenario = ifelse(scenario=="1a", "Piecewise Linear CATE",
                           ifelse(scenario=="1b", "Non-linear CATE",
                                  "Variable CATE")),
         scenario = factor(scenario, levels=c("Piecewise Linear CATE",
                                              "Non-linear CATE",
                                              "Variable CATE")))

Plots

First consider the main settings.

#Figure 1
mses_long %>%
  filter(cov_shift == "no", K==10, ns=="same") %>%
  ggplot(aes(x=sds, y=MSE, group=Method, color=ensemble)) +
  geom_jitter(aes(shape=base), size=2.5, width=.2, height=0) +
  #geom_line() +
  facet_wrap(~scenario, scales='free') +
  scale_x_discrete(labels = c("Low-Low","Med-Low", "Med-Med", "Med-High","High-High")) +
  scale_y_continuous(limits = c(0, 2.1)) +
  labs(shape="Single-Study Method", color="Aggregation Method") +
  guides(shape=guide_legend(order=1), color=guide_legend(order=2)) +
  #theme(axis.text.x = element_text(angle = 45)) +
  xlab("SD of Study Main and Study Interaction Coefficients") +
  theme(text = element_text(size=12))
## Warning: Removed 6 rows containing missing values (geom_point).

#ggsave("Plots/MLSims_Fig1_14Feb2023.jpeg",width=14,height=5,units="in")
mses_long %>%
  filter(cov_shift == "no", K==10, sds=="1, 0.5") %>%
  ggplot(aes(x=ns, y=MSE, group=Method, color=ensemble)) +
  geom_jitter(aes(shape=base), size=2.5, width=.2, height=0) +
  #geom_line() +
  facet_wrap(~scenario, scales='free') +
  scale_y_continuous(limits = c(0, 2.1)) +
  labs(shape="Single-Study Method", color="Aggregation Method") +
  guides(shape=guide_legend(order=1), color=guide_legend(order=2)) +
  #theme(axis.text.x = element_text(angle = 45)) +
  xlab("Trial Sizes") +
  theme(text = element_text(size=12))

mses_long %>%
  filter(K==30) %>%
  ggplot(aes(x=scenario, y=MSE, group=Method, color=ensemble)) +
  geom_jitter(aes(shape=base), size=2.5, width=.2, height=0) +
  #geom_line() +
  scale_y_continuous(limits = c(0, 2.1)) +
  labs(shape="Single-Study Method", color="Aggregation Method") +
  guides(shape=guide_legend(order=1), color=guide_legend(order=2)) +
  #theme(axis.text.x = element_text(angle = 45)) +
  xlab("Scenario") + ggtitle("K=30 Trials") +
  theme(text = element_text(size=12))

mses_long %>%
  filter(cov_shift == "yes") %>%
  ggplot(aes(x=ns, y=MSE, group=Method, color=ensemble)) +
  geom_jitter(aes(shape=base), size=2.5, width=.2, height=0) +
  #geom_line() +
  facet_wrap(~scenario) +
  labs(shape="Single-Study Method", color="Aggregation Method") +
  guides(shape=guide_legend(order=1), color=guide_legend(order=2)) +
  #theme(axis.text.x = element_text(angle = 45)) +
  xlab("Trial Sizes") + ggtitle("Covariate Shift") +
  theme(text = element_text(size=12))

Boxplots

#Figure 1 boxplot option
all_long <- all %>%
  pivot_longer(cols=x_nostudy:ma,
               names_to="Method",
               values_to="MSE") %>%
  mutate(MSE = as.numeric(MSE),
         base = case_when(grepl("x_",Method)==T ~ "X-Learner",
                          grepl("s_",Method)==T ~ "S-Learner",
                          grepl("causal_",Method)==T ~ "Causal Forest",
                          grepl("ma",Method)==T ~ "Meta-Analysis"),
         ensemble = case_when(grepl("nostudy",Method)==T ~ "Complete Pooling",
                              grepl("studyind",Method)==T ~ "Trial Indicator",
                              grepl("tree", Method)==T ~ "Ensemble Tree",
                              grepl("forest",Method)==T ~ "Ensemble Forest",
                              grepl("lasso",Method)==T ~ "Ensemble Lasso",
                              grepl("ss",Method)==T ~ "Single-Study",
                              grepl("ma",Method)==T ~ "Meta-Analysis"),
         base = factor(base, levels=c("S-Learner", "X-Learner", "Causal Forest", "Meta-Analysis")), 
         ensemble = factor(ensemble, levels=c("Complete Pooling", "Single-Study", "Trial Indicator", "Ensemble Tree",
                                              "Ensemble Forest","Ensemble Lasso", "Meta-Analysis")),
         sds = factor(paste(study_sd, study_inter_sd, sep=", ")),
         scenario = ifelse(scenario=="1a", "Piecewise Linear CATE",
                           ifelse(scenario=="1b", "Non-linear CATE",
                                  "Variable CATE")),
         scenario = factor(scenario, levels=c("Piecewise Linear CATE",
                                              "Non-linear CATE",
                                              "Variable CATE")))

First let’s look at our original setups.

#removing scenario 2 (variable CATE)
all_long %>%
  filter(cov_shift == "no", ns=="same", K==10, scenario != "Variable CATE") %>%
  ggplot(aes(x=ensemble, y=MSE)) +
  geom_boxplot(aes(color=base)) +
  facet_grid(sds~scenario, scales='free') +
  #scale_y_continuous(limits = c(0, 2.1)) +
  labs(color="Single-Study Method") + xlab("Aggregation Method") +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12))

#removing also high variability and complete pooling
all_long %>%
  filter(cov_shift == "no", ns=="same", K==10, scenario != "Variable CATE",
         ensemble != "Complete Pooling", sds != "3, 1") %>%
  ggplot(aes(x=ensemble, y=MSE)) +
  geom_boxplot(aes(color=base)) +
  facet_grid(sds~scenario) +
  #scale_y_continuous(limits = c(0, 2.1)) +
  labs(color="Single-Study Method") + xlab("Aggregation Method") +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12))

all_long %>%
  filter(cov_shift == "no", K==10, sds=="1, 0.5") %>%
  ggplot(aes(x=ensemble, y=MSE)) +
  geom_boxplot(aes(color=base)) +
  facet_grid(ns~scenario, scales='free') +
  #scale_y_continuous(limits = c(0, 2.1)) +
  labs(color="Single-Study Method") + xlab("Aggregation Method") +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12))

#remove complete pooling
all_long %>%
  filter(cov_shift == "no", K==10, sds=="1, 0.5", ensemble != "Complete Pooling") %>%
  ggplot(aes(x=ensemble, y=MSE)) +
  geom_boxplot(aes(color=base)) +
  facet_grid(ns~scenario, scales='free') +
  #scale_y_continuous(limits = c(0, 2.1)) +
  labs(color="Single-Study Method") + xlab("Aggregation Method") +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12))

#switch x-axis
all_long %>%
  filter(cov_shift == "no", K==10, sds=="1, 0.5", ensemble != "Complete Pooling") %>%
  ggplot(aes(x=base, y=MSE)) +
  geom_boxplot(aes(color=ensemble)) +
  facet_grid(ns~scenario) +
  #scale_y_continuous(limits = c(0, 2.1)) +
  labs(color="Aggregation Method") + xlab("Single-Study Method") +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12))

Now consider when K=30.

all_long %>%
  filter(K==30) %>%
  ggplot(aes(x=ensemble, y=MSE)) +
  geom_boxplot(aes(color=base)) +
  facet_wrap(~scenario) +
  #scale_y_continuous(limits = c(0, 2.1)) +
  labs(color="Single-Study Method") + xlab("Aggregation Method") +
  ggtitle("K=30") +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12))

Now consider covariate shift.

all_long %>%
  filter(cov_shift=="yes") %>%
  ggplot(aes(x=ensemble, y=MSE)) +
  geom_boxplot(aes(color=base)) +
  facet_grid(ns~scenario) +
  #scale_y_continuous(limits = c(0, 2.1)) +
  labs(color="Single-Study Method") + xlab("Aggregation Method") +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12)) +
  ggtitle("Covariate Shift")

#remove complete pooling
all_long %>%
  filter(cov_shift=="yes", ensemble != "Complete Pooling") %>%
  ggplot(aes(x=ensemble, y=MSE)) +
  geom_boxplot(aes(color=base)) +
  facet_grid(ns~scenario) +
  #scale_y_continuous(limits = c(0, 2.1)) +
  labs(color="Single-Study Method") + xlab("Aggregation Method") +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12)) +
  ggtitle("Covariate Shift")

Finally, consider scenario 2.

all_long %>%
  filter(scenario == "Variable CATE") %>%
  ggplot(aes(x=ensemble, y=MSE)) +
  geom_boxplot(aes(color=base)) +
  facet_wrap(~ns) +
  #scale_y_continuous(limits = c(0, 2.1)) +
  labs(color="Single-Study Method") + xlab("Aggregation Method") +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12)) +
  ggtitle("Variable CATE")

#Figure 2
mses_long %>%
  group_by(Method, ensemble, base) %>%
  summarise(MSE=mean(MSE)) %>%
  ggplot(aes(x=ensemble, y=MSE, group=1, color=base)) +
  geom_point(size=5) +
  theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1),
        plot.margin=margin(10,10,10,30),
        text = element_text(size=15)) +
  labs(color="Single-Study Approach") +
  xlab("Aggregation Approach")
## `summarise()` has grouped output by 'Method', 'ensemble'. You can override
## using the `.groups` argument.

#ggsave("Plots/MLSims_Fig2_14Feb2023.jpeg",width=8,height=5,units="in")
## anova
mod_params <- lm(MSE ~ factor(base) + factor(ensemble) + factor(study_sd) + 
                   factor(study_inter_sd) + factor(scenario) +
                   factor(base)*factor(ensemble), 
                 data=filter(means_long, base != "Meta-Analysis", scenario != 2))
summary(mod_params)
anova(mod_params)

anov <- aov(MSE ~ factor(base) + factor(ensemble) + factor(study_sd) + 
                   factor(study_inter_sd) + factor(scenario) +
                   factor(base)*factor(ensemble), 
                 data=filter(means_long, base != "Meta-Analysis", scenario != 2))

TukeyHSD(anov, 'factor(base)', conf.level=0.95)
TukeyHSD(anov, 'factor(ensemble)', conf.level=0.95)
TukeyHSD(anov, 'factor(study_sd)', conf.level=0.95)
TukeyHSD(anov, 'factor(study_inter_sd)', conf.level=0.95)
TukeyHSD(anov, 'factor(base):factor(ensemble)', conf.level=0.95)

Table

#results table
tab <- sd_mses %>%
  mutate(sc_combo = paste(scenario, study_sd, study_inter_sd, sep="_"),
         sc_combo = factor(sc_combo)) %>%
  select(s_nostudy, x_nostudy, causal_nostudy, s_studyind, x_studyind, causal_studyind,
         s_tree, x_tree, causal_tree, s_forest, x_forest, causal_forest,
         s_lasso, x_lasso, causal_lasso, ma) %>%
  t()
colnames(tab) <- c("Low-Low","Medium-Low","Medium-Medium",
                   "Medium-High","High-High","Low-Low",
                   "Medium-Low","Medium-Medium","Medium-High",
                   "High-High","")
rownames(tab) <- c("S - Pool","X - Pool", "CF - Pool", "S - Indicator",
                   "X - Indicator", "CF - Indicator", "S - Tree", "X - Tree",
                   "CF - Tree", "S - Forest", "X - Forest", "CF - Forest",
                   "S - Lasso", "X - Lasso", "CF - Lasso", "Meta-Analysis")

#print(tab)

rows <- c()
for (j in 1:nrow(tab)) {
  res <- rownames(tab)[j]
  for (i in 1:ncol(tab)) {
    res <- paste(res, tab[j,i], sep=" & ")
  }
  rows[j] <- res
}
rows

# library(knitr)
# library(kableExtra)
# kable(tab,"html") %>%
#   column_spec(1:11, border_right = T) %>%
#   column_spec(12, width = "30em") %>%
#   add_header_above(c("","Scenario 1a"=5, "Scenario 1b"=5, "Scenario 2"=1)) %>%
#   kable_styling()